/*** CONSTRUCTING THE SAMPLE FOR CALCULATING ABILITY GROUPING MEASURES ***/

clear all
set more 1

do "paths.do"

capture log close
log using "$LOGFILES\tracking_6.log", replace

/*
Run time: ~ 8 hours
*/

** Read in course names to determine over-level and under-level courses
import excel "E:\projects\107-Tracking\NewFilesReleased\CourseNames\CourseNames.xlsx", sheet("courses") firstrow clear
keep Sservicex serv_consol over8 under8
save "$WORKING/tracking_6_serv_consol.dta", replace

/*
 * Get physical districts for charter schools
 * For many calculations, we use physical/geographic districts rather than
 * administrative districts. While administrative districts are "hierarchical"
 * in the sense that the campus code begins with the district code, that is
 * not the case for physical districts.
 */
import excel "$charterlocs\CharterCampusesByPhysicalDistrict-2020.xlsx", sheet("Sheet1") cellrange(A1:B893) firstrow clear

destring, replace
rename CAMPUS campus
rename GeoDistrict distnum
sort campus
by campus: gen tmp1 = _n
assert (tmp1 == 1)
drop tmp*

gen exp0 = 10
expand exp0
sort campus
by campus: gen year = _n + 2009
keep campus year distnum
rename distnum geo_distnum

save "$WORKING/tracking_6_geo_distnum.dta", replace



foreach ss in math ela sci soc {
	use "$WORKING\tracking_4-`ss'", clear

	*********************************************
	*** LIMIT TO FALL ATTENDEES IN GRADES 3-9 ***
	*********************************************

	** KEEP students x school that have matching attendance and Fall transcript records
	assert id1!="" & campus!="" 
	descr rctype*
	tab rctype rctypeflag, m
	egen tmp1=tag(id1 year)
	gen tmp2=(rctype==1 & rctypeflag==0)
	egen tmp3=max(tmp2), by(id1 year)
	tab year tmp3 if tmp1==1, row m
	keep if tmp2==1
	* note: given the way data were constructed, no student should have multi grades by campus-year
	* so, can use id x campus x grade x year interchanged with id x campus x year
	assert grade!=.
	egen tmp4=tag(id1 year)
	egen tmp5=tag(id1 campus year)
	egen tmp6=tag(id1 campus grade year)
	for num 4/6: count if tmpX==1
	drop rctype* tmp*

	** KEEP students x school attending in first grading period
	descr sixweek*
	assert sixweek1==0|sixweek1==1
	egen tmp1=tag(id1 campus grade year)
	tab year sixweek1, row m 
	tab year sixweek1 if tmp==1, row m 
	keep if sixweek1==1
	drop tmp*

	** DROP grades 10+
	* if characterize high school tracking, would only do using grade 9
	* first show shares taking various math courses in high school 
	tab grade, m
	tab Sservicex if grade>=9 & grade<=12 & Sservicex!="" & (S`ss'==1|Sgeneric==1), gen(SS)
	tabstat SS* if grade>=9, by(grade)
	for num 9/12: sum SS* if grade==X
	egen tmp1=tag(id1 campus grade year)
	tab grade year if tmp1==1, col m
	drop if grade>9
	drop SS* tmp*

	** DROP students who have neither math nor generic classes
	* find number of math and generic classes by student x campus x year
	egen tmp1=tag(id1 campus grade year)
	egen tmp2=sum(S`ss'), by (id1 campus grade year)
	egen tmp3=sum(Sgeneric), by (id1 campus grade year)
	tab tmp3 tmp2 if tmp1==1, m 
	gen Sno`ss'=(tmp2==0 & tmp3==0)
	la var Sno`ss' "student has no `ss' or generic courses, campus x grade"
	tab Soth Sno`ss', m
	tab year Sno`ss' if tmp1==1, row
	tab grade Sno`ss' if tmp1==1, row
	tabstat Pdis_* Pspedr Pbil_* Pesl_* Pvoced Pdelinqu Ct_* if tmp1==1, by(Sno`ss') varwidth(16)
	tabstat Pdis_* Pspedr Pbil_* Pesl_* Pvoced Pdelinqu Ct_* if Sno`ss'==1 & tmp1==1, by (grade) varwidth(16)
	* calculate number of students with no math or generic course by campus x grade
	egen tmp5=sum(Sno`ss') if tmp1==1, by(campus grade year)
	egen CGno`ss'=max(tmp5), by(campus grade year)
	la var CGno`ss' "# students w/no `ss' or generic courses, campus x grade"
	drop Soth tmp*
	
	** save intermediate dataset before dropping these students
	if ("`ss'" == "math") {
		save "$WORKING\tracking_6_NOMATH.dta", replace
	}
	
	** drop these students
	drop if Sno`ss'==1
	
	************************************************
	*** TEMPORARILY REMOVE NON-DUPLICATE RECORDS ***
	************************************************
		
	** identify whether there is more than one record per student
	sort id1 campus grade year
	by id1 campus grade year: gen tmp1 = _N
	gen Snodupmult = (tmp1 == 1)
	la var Snodupmult "unique course record, student x campus x grade x year"
	tab year Snodupmult
	drop tmp1
	
	** address class IDs that have students taking different subjects
	*** by forcing class IDs to be nested within subjects
	rename Sclass_id Sclass_id_orig
	la var Sclass_id_orig "original class ID"
	* pad new class id to max length (with leading 0s)
	gen tmp1 = strlen(Sclass_id_orig)
	sum tmp1
	gen Sclass_id = substr(("0" * (r(max))) + Sclass_id_orig, -r(max), .)	
	
	
	* determine number of distinct subjects within a class ID
	egen tmp2=tag(campus grade year Sclass_id Sservice)
	sort tmp2 campus grade year Sclass_id Sservice
	by tmp2 campus grade year Sclass_id: gen tmp3 = _n if tmp2
	egen tmp4 = min(tmp3), by(campus grade year Sclass_id Sservice)
	tab tmp4, m
	** create unique code to add (of common max length)
	gen tmp5 = string(tmp4)
	gen tmp6 = strlen(tmp5)
	sum tmp6
	gen tmp7 = substr(("0" * (r(max))) + tmp5, -r(max), .)
	replace Sclass_id = Sclass_id + tmp7
	la var Sclass_id "original class ID differentiated by subject"
	drop tmp*
	
	** number of students represented in tracking sample
	egen tmp1 = tag(id1 campus grade year)
	summ tmp1 if (tmp1 == 1)
	global Nstud = r(N)
	drop tmp*
	
	** create enrollment variables
	egen tmp1 = tag(id1 campus grade year)
	egen CGenrl = sum(tmp1), by(campus grade year)
	egen tmp2 = tag(id1 campus grade year Sservice)
	egen CGenrl_by_svc0 = sum(tmp2), by(campus grade year Sservice)
	egen tmp3 = tag(id1 campus grade year Sclass_id)
	egen CGenrl_by_cls0 = sum(tmp3), by(campus grade year Sclass_id)
	la var CGenrl "campus x grade enrollment, tracking sample"
	la var CGenrl_by_svc0 "campus x grade x Sservice enrollment, tracking samp orig"
	la var CGenrl_by_cls0 "campus x grade X class enrollment, tracking samp orig"
	drop tmp*
	
	** save datasets
	* full dataset
	save "$WORKING\temp\tracking_6_Snodupmult.dta", replace
	* students with unique records
	keep if Snodupmult
	save "$WORKING\temp\tracking_6_Snodupmult1.dta", replace
	* students with non-unique records
	use "$WORKING\temp\tracking_6_Snodupmult.dta", clear
	keep if !Snodupmult
	save "$WORKING\temp\tracking_6_Snodupmult0.dta", replace
	
	
	***************************************
	*** ADDRESS DUPLICATE CLASS RECORDS ***
	***************************************
	
	use "$WORKING\temp\tracking_6_Snodupmult0.dta", clear
	
	** these are duplicates by Sservice x Sclass_id (for a student x campus x year)
	* these appear to be cases where the transcripts record Fall grades, and then have
	* a duplicate entry recording either the Spring or cumulative grades
	
	** number of students w/ multiple class and/or course records
	egen tmp1 = tag(id1 campus grade year)
	summ tmp1 if tmp1 == 1
	global Nstud_dup = r(N)
	drop tmp*
	
	** check if any course variables are missing 
	descr S*
	assert (Sservice != "") & (Sclass_id != "") & (Scourse_seq != "") & (Sstudent_begin_date != .) & (Sstudent_end_date != .)

	** identify first term of year long courses and 1-term and year-long courses
	/* Scourse_seq:
	 0      = one semester course
	 1,2    = 1st and 2nd half of 2 semester course
	 3,4,5  = 1st, 2nd and 3rd part of 3-semester course
	 6-9    = fourths of 4-semester course
	 A      = non-HS year long course */
	tab Scourse_seq, m
	sort grade
	by grade: tab Scourse_seq, m
	gen Sfirst=0
	for any 1 3 6: replace Sfirst=1 if Scourse_seq=="X"
	summ Sfirst
	la var Sfirst "first-term course"
	gen Slast = !Sfirst
	replace Slast = 0.5 if Scourse_seq == "0"
	
	** identify course start and end dates (year, month, day)
	gen Sbegyr = year(Sstudent_begin_date)
	gen Sbegmo = month(Sstudent_begin_date)
	gen Sbegdy = day(Sstudent_begin_date)
	gen Sendyr = year(Sstudent_end_date)
	gen Sendmo = month(Sstudent_end_date)
	gen Senddy = day(Sstudent_end_date)
	gen Sbegyrmo = 100000 + ((1 + Sbegyr - year) * 1000) + Sbegmo
	gen Sendyrmo = 100000 + ((1 + Sendyr - year) * 1000) + Sendmo
	tab Sendyrmo Sbegyrmo, m
	
	** address duplicates by Sservice x Sclass_id (for a student x campus x year)
	* these appear to be cases where the transcripts record Fall grades, and then have
	* a duplicate entry recording either the Spring or cumulative grades
	
	** within the same class_id, keep class w/ earliest start and end dates, prioritizing 1st course w/in sequences
	sort id1 campus grade year Sclass_id Sbegyr Sbegmo Sbegdy Sendyr Sendmo Senddy Slast Scourse_seq
	by id1 campus grade year Sclass_id Sbegyr Sbegmo Sbegdy Sendyr Sendmo Senddy Slast Scourse_seq: gen tmp1 = _n
	assert tmp1 == 1
	by id1 campus grade year Sclass_id Sbegyr Sbegmo Sbegdy Sendyr Sendmo Senddy Slast: gen tmp2 = _n
	tab year tmp2
	assert tmp2 <= 2
	by id1 campus grade year Sclass_id Sbegyr Sbegmo Sbegdy Sendyr Sendmo Senddy: gen tmp3 = _n
	tab year tmp3 if (tmp2 == 1)
	assert tmp3 <= 2 if (tmp2 == 1)
	by id1 campus grade year Sclass_id Sbegyr Sbegmo Sbegdy: gen tmp4 = _n
	tab year tmp4 if (tmp3 == 1)
	assert tmp4 <= 2 if (tmp3 == 1)
	by id1 campus grade year Sclass_id: gen tmp5 = _n
	tab year tmp5 if (tmp4 == 1)
	assert tmp5 <= 2 if (tmp4 == 1)
	* KEEP unique class record
	* DROP duplicate type A
	keep if tmp5 == 1
	drop tmp*
	
	
	** address duplicates by Sservice (for a student x campus x year)
	* these are trickier to deal with due to the presence of generic (e.g., Grade 1) entries
	* keep all that share the earliest start and end dates, and earliest course sequence type
	sort id1 campus grade year Sservice Sbegyr Sbegmo Sbegdy Sendyr Sendmo Senddy Slast Scourse_seq
	by id1 campus grade year Sservice: gen tmp1 = _n
	tab tmp1 grade
	gen tmp2a = Sbegyrmo if (tmp1 == 1)
	gen tmp3a = Sendyrmo if (tmp1 == 1)
	gen tmp4a = Slast if (tmp1 == 1)
	for num 2/4: egen tmpX = max(tmpXa), by(id1 campus grade year Sservice)
	* DROP duplicate type B
	drop if (Sbegyrmo != tmp2) | (Sendyrmo != tmp3) | (Slast != tmp4)
	drop tmp*
	
	
	*************************************************
	*** ADDRESS REMAINING MULTIPLE COURSE RECORDS ***
	*************************************************
	
	** explore remaining duplicates with same service code, after initial cleaning
	duplicates tag id1 campus grade year Sservice, gen(dup1)
	tab dup1, m
	tab Sservicex if dup1!=0, m
	gen Sdupl=(dup1>0)
	egen Sdupl_any=max(Sdupl), by(id1 campus grade year)
	 la var Sdupl "student has dupl record for this course, campus x grade"
	 la var Sdupl_any "student has dupl record for any course, campus x grade"
	egen tmp1=tag(id1 campus grade year)
	tab Sdupl_any if tmp1==1
	tabstat Sdupl_any if tmp1==1, by(grade)
	drop dup* tmp*
	
	preserve
	collapse (sum) CGdup_by_svc=Sdupl_any, by(campus grade year Sservice)
	egen CGdup = sum(CGdup_by_svc), by(campus grade year)
	la var CGdup "# students w/dupl course records (same Sservice), campus x grade"
	la var CGdup_by_svc "# students w/dupl, campus x grade x Sservice"
	save "$WORKING\temp\tracking_6_CGdup.dta", replace
	restore
	
	** identify peers with same set of duplicate class IDs
	egen tmp1=group(Sclass_id) if Sdupl==1
	egen tmp2=min(tmp1), by(id1 campus grade year Sservice)
	egen tmp3=max(tmp1), by(id1 campus grade year Sservice) 
	egen double tmp4=sum(tmp1), by(id1 campus grade year Sservice)
	egen tmp5=group(campus grade year tmp2 tmp3 tmp4) if Sdupl==1
	* distribution of numbers of students linked together
	egen tmp6=tag(id1 campus grade year Sservice)
	egen Sdupl_mtch=sum(tmp6) if Sdupl==1, by(tmp5)
	tab Sdupl_mtch if Sdupl==1
	la var Sdupl_mtch "number of peers w/same set of dupl allocated to same class"
	tab Sdupl_mtch if Sdupl
	drop tmp*
	
	** keep largest class if has multiple records for the same course/subject
	gen tmp1 = -CGenrl_by_cls0
	sort id1 campus grade year Sservice tmp1 Sclass_id
	by id1 campus grade year Sservice tmp1: gen tmp2 = _n
	tab year tmp2
	by id1 campus grade year Sservice: gen tmp3 = _n
	tab year tmp3 if (tmp2 == 1)
	* DROP duplicate type C
	keep if tmp3 == 1
	drop tmp*

	********************************************************
	*** ADDRESS STUDENTS IN MULIPLE CORE SUBJECT COURSES ***
	********************************************************

	** identify these cases
	duplicates tag id1 campus grade year, gen(dup1)
	tab dup1, m
	tab Sservicex if dup1!=0, m
	gen Smult=(dup1>0)
	la var Smult "student has multiple `ss' courses, campus x grade"
	egen tmp1=tag(id1 campus grade year)
	summ tmp1 if tmp1==1
	assert r(N) == $Nstud_dup
	disp $Nstud_dup
	tabstat Smult if tmp1==1, by(grade)
	* create variable for number of peers w/ multiple math courses
	egen tmp2=sum(Smult) if tmp1==1, by(campus grade year)
	egen CGmult=max(tmp2), by(campus grade year)
	la var CGmult "# students w/ multiple `ss' courses, campus x grade"
	drop dup* tmp*
	
	** keep subject w/ largest class enrollment
	gen tmp1 = -CGenrl_by_cls0
	sort id1 campus grade year tmp1 Sclass_id
	by id1 campus grade year tmp1: gen tmp2 = _n
	tab year tmp2
	by id1 campus grade year: gen tmp3 = _n
	tab year tmp3 if (tmp2 == 1)
	* DROP duplicate type D
	keep if tmp3 == 1
	drop tmp*
	
	save "$WORKING\temp\tracking_6_Snodupmult2.dta", replace
	
	************************************************
	*** ADD BACK IN STUDENTS WITH UNIQUE RECORDS ***
	************************************************
	
	use "$WORKING\temp\tracking_6_Snodupmult2.dta", clear
	append using "$WORKING\temp\tracking_6_Snodupmult1.dta"
	duplicates tag id1 campus grade year, gen(dup1)
	assert dup1==0
	assert _N==$Nstud
	disp $Nstud
	drop dup*
	
	merge m:1 campus grade year Sservice using "$WORKING\temp\tracking_6_CGdup.dta"
	keep if _merge != 2
	drop _merge
	
	** replace missing values for new variables
	foreach vv in Sdupl Sdupl_any Sdupl_mtch Smult {
		replace `vv' = 0 if `vv' == .
	}
	foreach vv in CGmult CGdup CGenrl {
		egen tmp1 = min(`vv'), by(campus grade year)
		replace `vv' = tmp1
		replace `vv' = 0 if (`vv' == .)
		drop tmp*
	}
	foreach vv in CGdup_by_svc CGenrl_by_svc {
		egen tmp1 = min(`vv'), by(campus grade year Sservice)
		replace `vv' = tmp1
		replace `vv' = 0 if (`vv' == .)
		drop tmp*
	}
	
	
	gen one0 = 1
	egen cgy_sz = sum(one0), by(campus grade year)
	egen cgys_sz = sum(one0), by(campus grade year Sservice)
	egen cgysc_sz = sum(one0), by(campus grade year Sclass_id)
	assert cgy_sz == CGenrl
	assert cgys_sz <= CGenrl_by_svc0
	assert cgysc_sz <= CGenrl_by_cls0
	rename cgys_sz CGenrl_by_svc
	rename cgysc_sz CGenrl_by_cls
	la var CGenrl_by_svc
	la var CGenrl_by_svc0 "campus x grade x Sservice enrollment, tracking sample"
	la var CGenrl_by_cls0 "campus x grade X class enrollment, tracking sample"
	drop one0 cgy*
	
	
	************************************************
	*** CREATE Z-SCORES FROM TRACKING_1 MEAN/SDS ***
	************************************************
	
	** create variable for missing lagged math score
	gen missLM = (LMscore == .)
	egen CGmissLM = sum(missLM), by(campus grade year)
	la var missLM "missing lagged math score"
	la var CGmissLM "# stud missing lag math score, campus x grade, tracking samp"
	
	** merge in dataset with mean/sd by grade-year
	gen test_grade = grade - 1
	gen test_year = year - 1
	merge m:1 test_grade test_year using "$WORKING\tracking_1-meansd.dta"
	drop if _merge == 2
	assert (_merge == 3) | (grade == 9) | (grade == 3)
	drop _merge test_grade test_year
	la var score_mean "mean score on prev-grade prev-year test"
	la var score_sd "sd of scores on prev-grade prev-year test"
	
	gen zLMscore = (LMscore - score_mean) / score_sd
	la var zLMscore "normalized lagged `ss' score"

	gen tmp1 = (LMscore == .) | (zLMscore != .) | (grade == 9) | (grade == 3)
	assert tmp1
	drop tmp*
	
	** create variable for mean/std dev of lagged math score
	egen CGmnzLM=mean(zLMscore), by(campus grade year)
	egen CGsdzLM=sd(zLMscore), by(campus grade year)
	la var CGmnzLM "average lagged math z-score, campus x grade, tracking samp"
	la var CGsdzLM "std dev lagged math z-score, campus x grade, tracking samp"
	 
	foreach dd in 10 25 75 90 {
		egen CGdec`dd'zLM = pctile(zLMscore), p(`dd') by(campus grade year)
		la var CGdec`dd'zLM "percentile `dd' lagged math z-score, campus x grade, tracking samp"
	}
	
	if ("`ss'" == "math") {
		merge m:1 Sservicex using "$WORKING/tracking_6_serv_consol.dta"
		assert _merge == 3
		drop _merge
	}
	else {
		gen serv_consol = Sservicex
		gen over8 = .
		gen under8 = .
	}
	
	destring campus, replace
	merge m:1 campus year using "$WORKING/tracking_6_geo_distnum.dta"
	keep if (_merge != 2)
	drop _merge
	
	gen distnum = floor(campus / 1000)
	gen distnum0 = distnum
	replace distnum0 = geo_distnum if (Ct_chrtd == 1) | (Ct_chrtc == 1) | (Ct_chrtu == 1)
	
	save "$WORKING\tracking_6-`ss'", replace
}



** list of course names

use "$WORKING/tracking_6-math.dta", clear

keep Sservicex serv_consol grade
gen one0 = 1
collapse (sum) obs=one0, by(serv_consol Sservicex grade)
reshape wide obs i(serv_consol Sservicex) j(grade)
sort Sservicex serv_consol
order Sservicex obs* serv_consol

export delimited using "$OUTPUT/tracking_6-coursenames.csv", replace


log close
exit, clear




	
	
	